In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [5]:
comments = pd.read_csv(r'C:\Users\supri\Downloads\UScomments.csv', on_bad_lines='skip')
C:\Users\supri\AppData\Local\Temp\ipykernel_25360\2911899788.py:1: DtypeWarning: Columns (2,3) have mixed types. Specify dtype option on import or set low_memory=False. comments = pd.read_csv(r'C:\Users\supri\Downloads\UScomments.csv', on_bad_lines='skip')
In [6]:
comments.head()
Out[6]:
| video_id | comment_text | likes | replies | |
|---|---|---|---|---|
| 0 | XpVt6Z1Gjjo | Logan Paul it's yo big day ‼️‼️‼️ | 4 | 0 |
| 1 | XpVt6Z1Gjjo | I've been following you from the start of your... | 3 | 0 |
| 2 | XpVt6Z1Gjjo | Say hi to Kong and maverick for me | 3 | 0 |
| 3 | XpVt6Z1Gjjo | MY FAN . attendance | 3 | 0 |
| 4 | XpVt6Z1Gjjo | trending 😉 | 3 | 0 |
In [7]:
comments.isnull().sum()
Out[7]:
video_id 0 comment_text 26 likes 0 replies 0 dtype: int64
In [8]:
comments.dropna(inplace=True)
In [9]:
comments.isnull().sum()
Out[9]:
video_id 0 comment_text 0 likes 0 replies 0 dtype: int64
Performing Sentiment Analysis
In [10]:
!pip install textblob
Requirement already satisfied: textblob in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (0.18.0.post0) Requirement already satisfied: nltk>=3.8 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from textblob) (3.8.1) Requirement already satisfied: click in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (8.1.7) Requirement already satisfied: joblib in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (1.4.0) Requirement already satisfied: regex>=2021.8.3 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (2023.12.25) Requirement already satisfied: tqdm in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (4.66.2) Requirement already satisfied: colorama in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from click->nltk>=3.8->textblob) (0.4.6)
In [11]:
from textblob import TextBlob
In [12]:
comments.head(6)
Out[12]:
| video_id | comment_text | likes | replies | |
|---|---|---|---|---|
| 0 | XpVt6Z1Gjjo | Logan Paul it's yo big day ‼️‼️‼️ | 4 | 0 |
| 1 | XpVt6Z1Gjjo | I've been following you from the start of your... | 3 | 0 |
| 2 | XpVt6Z1Gjjo | Say hi to Kong and maverick for me | 3 | 0 |
| 3 | XpVt6Z1Gjjo | MY FAN . attendance | 3 | 0 |
| 4 | XpVt6Z1Gjjo | trending 😉 | 3 | 0 |
| 5 | XpVt6Z1Gjjo | #1 on trending AYYEEEEE | 3 | 0 |
In [13]:
TextBlob("Logan Paul it's yo big day ‼️‼️‼️").sentiment.polarity
Out[13]:
0.0
In [14]:
comments.shape
Out[14]:
(691374, 4)
In [15]:
sample_df = comments[0:1000]
In [16]:
sample_df.shape
Out[16]:
(1000, 4)
In [ ]:
In [17]:
polarity = []
for comment in comments['comment_text']:
try:
polarity.append(TextBlob(comment).sentiment.polarity)
except:
polarity.append(0)
In [18]:
len(polarity)
Out[18]:
691374
In [19]:
comments['polarity'] = polarity
In [20]:
comments.head()
Out[20]:
| video_id | comment_text | likes | replies | polarity | |
|---|---|---|---|---|---|
| 0 | XpVt6Z1Gjjo | Logan Paul it's yo big day ‼️‼️‼️ | 4 | 0 | 0.0 |
| 1 | XpVt6Z1Gjjo | I've been following you from the start of your... | 3 | 0 | 0.0 |
| 2 | XpVt6Z1Gjjo | Say hi to Kong and maverick for me | 3 | 0 | 0.0 |
| 3 | XpVt6Z1Gjjo | MY FAN . attendance | 3 | 0 | 0.0 |
| 4 | XpVt6Z1Gjjo | trending 😉 | 3 | 0 | 0.0 |
In [21]:
filter1 = comments['polarity'] == 1
In [22]:
comments_positive =comments[filter1]
In [23]:
comments_positive.head(5)
Out[23]:
| video_id | comment_text | likes | replies | polarity | |
|---|---|---|---|---|---|
| 64 | XpVt6Z1Gjjo | yu are the best | 1 | 0 | 1.0 |
| 156 | cLdxuaxaQwc | Power is the disease. Care is the cure. Keep... | 0 | 0 | 1.0 |
| 227 | WYYvHb03Eog | YAS Can't wait to get it! I just need to sell ... | 0 | 0 | 1.0 |
| 307 | sjlHnJvXdQs | This is priceless | 0 | 0 | 1.0 |
| 319 | sjlHnJvXdQs | Summed up perfectly | 0 | 0 | 1.0 |
Performing Wordcloud Analysis
In [24]:
!pip install wordcloud
Requirement already satisfied: wordcloud in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (1.9.3) Requirement already satisfied: numpy>=1.6.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (1.26.4) Requirement already satisfied: pillow in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (10.3.0) Requirement already satisfied: matplotlib in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (3.8.4) Requirement already satisfied: contourpy>=1.0.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (1.2.1) Requirement already satisfied: cycler>=0.10 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (1.4.5) Requirement already satisfied: packaging>=20.0 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (24.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (3.1.2) Requirement already satisfied: python-dateutil>=2.7 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
In [25]:
from wordcloud import WordCloud, STOPWORDS
In [26]:
set(STOPWORDS)
Out[26]:
{'a',
'about',
'above',
'after',
'again',
'against',
'all',
'also',
'am',
'an',
'and',
'any',
'are',
"aren't",
'as',
'at',
'be',
'because',
'been',
'before',
'being',
'below',
'between',
'both',
'but',
'by',
'can',
"can't",
'cannot',
'com',
'could',
"couldn't",
'did',
"didn't",
'do',
'does',
"doesn't",
'doing',
"don't",
'down',
'during',
'each',
'else',
'ever',
'few',
'for',
'from',
'further',
'get',
'had',
"hadn't",
'has',
"hasn't",
'have',
"haven't",
'having',
'he',
"he'd",
"he'll",
"he's",
'hence',
'her',
'here',
"here's",
'hers',
'herself',
'him',
'himself',
'his',
'how',
"how's",
'however',
'http',
'i',
"i'd",
"i'll",
"i'm",
"i've",
'if',
'in',
'into',
'is',
"isn't",
'it',
"it's",
'its',
'itself',
'just',
'k',
"let's",
'like',
'me',
'more',
'most',
"mustn't",
'my',
'myself',
'no',
'nor',
'not',
'of',
'off',
'on',
'once',
'only',
'or',
'other',
'otherwise',
'ought',
'our',
'ours',
'ourselves',
'out',
'over',
'own',
'r',
'same',
'shall',
"shan't",
'she',
"she'd",
"she'll",
"she's",
'should',
"shouldn't",
'since',
'so',
'some',
'such',
'than',
'that',
"that's",
'the',
'their',
'theirs',
'them',
'themselves',
'then',
'there',
"there's",
'therefore',
'these',
'they',
"they'd",
"they'll",
"they're",
"they've",
'this',
'those',
'through',
'to',
'too',
'under',
'until',
'up',
'very',
'was',
"wasn't",
'we',
"we'd",
"we'll",
"we're",
"we've",
'were',
"weren't",
'what',
"what's",
'when',
"when's",
'where',
"where's",
'which',
'while',
'who',
"who's",
'whom',
'why',
"why's",
'with',
"won't",
'would',
"wouldn't",
'www',
'you',
"you'd",
"you'll",
"you're",
"you've",
'your',
'yours',
'yourself',
'yourselves'}
In [27]:
comments['comment_text']
Out[27]:
0 Logan Paul it's yo big day ‼️‼️‼️
1 I've been following you from the start of your...
2 Say hi to Kong and maverick for me
3 MY FAN . attendance
4 trending 😉
...
691395 Лучшая
691396 qu'est ce que j'aimerais que tu viennes à Roan...
691397 Ven a mexico! 😍 te amo LP
691398 Islığı yeter...
691399 Kocham tą piosenkę😍❤❤❤byłam zakochana po uszy ...
Name: comment_text, Length: 691374, dtype: object
In [28]:
type(comments['comment_text'])
Out[28]:
pandas.core.series.Series
In [29]:
total_comments_positive = ' '.join(comments_positive['comment_text'])
In [30]:
wordcloud_positive = WordCloud(stopwords = set(STOPWORDS)).generate(total_comments_positive)
In [31]:
plt.imshow(wordcloud_positive)
plt.axis('off')
Out[31]:
(-0.5, 399.5, 199.5, -0.5)
In [32]:
filter2 = comments['polarity'] == -1
In [33]:
comments_negative = comments[filter2]
In [34]:
comments_negative.head(5)
Out[34]:
| video_id | comment_text | likes | replies | polarity | |
|---|---|---|---|---|---|
| 512 | 8wNr-NQImFg | BEN CARSON IS THE MAN!!!!! THEY HATE HIM CAUSE... | 0 | 0 | -1.0 |
| 562 | 8wNr-NQImFg | Well… The brain surgeon Ben Carson just proved... | 0 | 0 | -1.0 |
| 952 | Ayb_2qbZHm4 | WHY DID YOU MAKE FURRY FORCE?! SO NASTY!!! | 0 | 0 | -1.0 |
| 1371 | vu_9muoxT50 | WTF BRUH!!!!!! | 0 | 0 | -1.0 |
| 1391 | vu_9muoxT50 | cheeseus christ thats insane!!! | 0 | 0 | -1.0 |
In [35]:
total_comments_negative = ' '.join(comments_negative['comment_text'])
In [36]:
wordcloud_negative = WordCloud(stopwords = set(STOPWORDS)).generate(total_comments_negative)
In [37]:
plt.imshow(wordcloud_negative)
plt.axis('off')
Out[37]:
(-0.5, 399.5, 199.5, -0.5)
Perform Emoji Analysis
In [38]:
!pip install emoji==2.2.0
Requirement already satisfied: emoji==2.2.0 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (2.2.0)
In [39]:
import emoji
In [40]:
emoji.__version__
Out[40]:
'2.2.0'
In [41]:
comments['comment_text'].head(5)
Out[41]:
0 Logan Paul it's yo big day ‼️‼️‼️ 1 I've been following you from the start of your... 2 Say hi to Kong and maverick for me 3 MY FAN . attendance 4 trending 😉 Name: comment_text, dtype: object
In [42]:
comment = 'trending 😉'
In [43]:
[char for char in comment if char in emoji.EMOJI_DATA]
Out[43]:
['😉']
In [44]:
emoji_list = []
for comment in comments['comment_text']:
for char in comment:
if char in emoji.EMOJI_DATA:
emoji_list.append(char)
In [45]:
emoji_list[0:10]
Out[45]:
['‼', '‼', '‼', '😉', '😭', '👍', '🏻', '❤', '😍', '💋']
In [46]:
from collections import Counter
In [47]:
Counter(emoji_list).most_common(10)
Out[47]:
[('😂', 36987),
('😍', 33453),
('❤', 31119),
('🔥', 8694),
('😭', 8398),
('👏', 5719),
('😘', 5545),
('👍', 5476),
('💖', 5359),
('💕', 5147)]
In [48]:
emojis = [Counter(emoji_list).most_common(10)[i][0]for i in range(10)]
In [49]:
emojis
Out[49]:
['😂', '😍', '❤', '🔥', '😭', '👏', '😘', '👍', '💖', '💕']
In [50]:
frequencies = [Counter(emoji_list).most_common(10)[i][1]for i in range(10)]
In [51]:
frequencies
Out[51]:
[36987, 33453, 31119, 8694, 8398, 5719, 5545, 5476, 5359, 5147]
In [52]:
import plotly.graph_objs as go
from plotly.offline import iplot
In [53]:
trace = go.Bar(x=emojis, y=frequencies)
In [54]:
iplot([trace])
Collect Entire Data of Youtube: Data Collection
In [55]:
import os
In [63]:
files = os.listdir(r'D:\youtube\additional_data')
In [64]:
files
Out[64]:
['CAvideos.csv', 'CA_category_id.json', 'DEvideos.csv', 'DE_category_id.json', 'FRvideos.csv', 'FR_category_id.json', 'GBvideos.csv', 'GB_category_id.json', 'INvideos.csv', 'IN_category_id.json', 'JPvideos.csv', 'JP_category_id.json', 'KRvideos.csv', 'KR_category_id.json', 'MXvideos.csv', 'MX_category_id.json', 'RUvideos.csv', 'RU_category_id.json', 'USvideos.csv', 'US_category_id.json']
In [65]:
files_csv = [file for file in files if '.csv' in file]
In [66]:
files_csv
Out[66]:
['CAvideos.csv', 'DEvideos.csv', 'FRvideos.csv', 'GBvideos.csv', 'INvideos.csv', 'JPvideos.csv', 'KRvideos.csv', 'MXvideos.csv', 'RUvideos.csv', 'USvideos.csv']
In [67]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')
In [75]:
full_df = pd.DataFrame()
path = r'D:\youtube\additional_data'
for file in files_csv:
current_df = pd.read_csv(path+'/'+file , encoding='iso-8859-1', on_bad_lines='skip')
full_df = pd.concat([full_df, current_df] , ignore_index=True)
In [76]:
full_df.shape
Out[76]:
(375942, 16)
In [78]:
full_df[full_df.duplicated()].shape
Out[78]:
(36417, 16)
In [80]:
full_df = full_df.drop_duplicates()
In [82]:
full_df.shape
Out[82]:
(339525, 16)
How to export data into csv, json, database etc.
In [87]:
full_df[0:1000].to_csv(r'D:\youtube\Export_data/youtube_sample.csv', index=False)
In [89]:
full_df[0:1000].to_json(r'D:\youtube\Export_data/youtube_sample.json')
In [91]:
from sqlalchemy import create_engine
In [95]:
engine = create_engine(r'sqlite:///D:\youtube\Export_data/youtube_sample.sqlite')
In [96]:
full_df[0:1000].to_sql('User',con = engine,if_exists = 'append')
Out[96]:
1000
Analysing the most liked category!
In [97]:
full_df.head()
Out[97]:
| video_id | trending_date | title | channel_title | category_id | publish_time | tags | views | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | video_error_or_removed | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | n1WpP7iowLc | 17.14.11 | Eminem - Walk On Water (Audio) ft. Beyoncé | EminemVEVO | 10 | 2017-11-10T17:00:03.000Z | Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... | 17158579 | 787425 | 43420 | 125882 | https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg | False | False | False | Eminem's new track Walk on Water ft. Beyoncé ... |
| 1 | 0dBIkQ4Mz1M | 17.14.11 | PLUSH - Bad Unboxing Fan Mail | iDubbbzTV | 23 | 2017-11-13T17:00:00.000Z | plush|"bad unboxing"|"unboxing"|"fan mail"|"id... | 1014651 | 127794 | 1688 | 13030 | https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg | False | False | False | STill got a lot of packages. Probably will las... |
| 2 | 5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Le... | Rudy Mancuso | 23 | 2017-11-12T19:05:24.000Z | racist superman|"rudy"|"mancuso"|"king"|"bach"... | 3191434 | 146035 | 5339 | 8181 | https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | False | False | False | WATCH MY PREVIOUS VIDEO ⶠ\n\nSUBSCRIBE ⺠... |
| 3 | d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | 24 | 2017-11-12T18:01:41.000Z | ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... | 2095828 | 132239 | 1989 | 17518 | https://i.ytimg.com/vi/d380meD0W0M/default.jpg | False | False | False | I know it's been a while since we did this sho... |
| 4 | 2Vv-BfVoq4g | 17.14.11 | Ed Sheeran - Perfect (Official Music Video) | Ed Sheeran | 10 | 2017-11-09T11:04:14.000Z | edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... | 33523622 | 1634130 | 21082 | 85067 | https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg | False | False | False | ð§: https://ad.gt/yt-perfect\nð°: https://... |
In [98]:
full_df['category_id'].unique()
Out[98]:
array([10, 23, 24, 25, 22, 26, 1, 28, 20, 17, 29, 15, 19, 2, 27, 43, 30,
44], dtype=int64)
In [105]:
json_df = pd.read_json(r'D:\youtube\additional_data\US_category_id.json')
In [106]:
json_df
Out[106]:
| kind | etag | items | |
|---|---|---|---|
| 0 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 1 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 2 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 3 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 4 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 5 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 6 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 7 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 8 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 9 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 10 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 11 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 12 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 13 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 14 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 15 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 16 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 17 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 18 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 19 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 20 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 21 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 22 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 23 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 24 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 25 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 26 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 27 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 28 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 29 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 30 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
| 31 | youtube#videoCategoryListResponse | "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... | {'kind': 'youtube#videoCategory', 'etag': '"m2... |
In [107]:
json_df['items']
Out[107]:
0 {'kind': 'youtube#videoCategory', 'etag': '"m2...
1 {'kind': 'youtube#videoCategory', 'etag': '"m2...
2 {'kind': 'youtube#videoCategory', 'etag': '"m2...
3 {'kind': 'youtube#videoCategory', 'etag': '"m2...
4 {'kind': 'youtube#videoCategory', 'etag': '"m2...
5 {'kind': 'youtube#videoCategory', 'etag': '"m2...
6 {'kind': 'youtube#videoCategory', 'etag': '"m2...
7 {'kind': 'youtube#videoCategory', 'etag': '"m2...
8 {'kind': 'youtube#videoCategory', 'etag': '"m2...
9 {'kind': 'youtube#videoCategory', 'etag': '"m2...
10 {'kind': 'youtube#videoCategory', 'etag': '"m2...
11 {'kind': 'youtube#videoCategory', 'etag': '"m2...
12 {'kind': 'youtube#videoCategory', 'etag': '"m2...
13 {'kind': 'youtube#videoCategory', 'etag': '"m2...
14 {'kind': 'youtube#videoCategory', 'etag': '"m2...
15 {'kind': 'youtube#videoCategory', 'etag': '"m2...
16 {'kind': 'youtube#videoCategory', 'etag': '"m2...
17 {'kind': 'youtube#videoCategory', 'etag': '"m2...
18 {'kind': 'youtube#videoCategory', 'etag': '"m2...
19 {'kind': 'youtube#videoCategory', 'etag': '"m2...
20 {'kind': 'youtube#videoCategory', 'etag': '"m2...
21 {'kind': 'youtube#videoCategory', 'etag': '"m2...
22 {'kind': 'youtube#videoCategory', 'etag': '"m2...
23 {'kind': 'youtube#videoCategory', 'etag': '"m2...
24 {'kind': 'youtube#videoCategory', 'etag': '"m2...
25 {'kind': 'youtube#videoCategory', 'etag': '"m2...
26 {'kind': 'youtube#videoCategory', 'etag': '"m2...
27 {'kind': 'youtube#videoCategory', 'etag': '"m2...
28 {'kind': 'youtube#videoCategory', 'etag': '"m2...
29 {'kind': 'youtube#videoCategory', 'etag': '"m2...
30 {'kind': 'youtube#videoCategory', 'etag': '"m2...
31 {'kind': 'youtube#videoCategory', 'etag': '"m2...
Name: items, dtype: object
In [109]:
json_df['items'][0]
Out[109]:
{'kind': 'youtube#videoCategory',
'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
'id': '1',
'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
'title': 'Film & Animation',
'assignable': True}}
In [108]:
cat_dict = {}
for item in json_df['items'].values:
cat_dict[int(item['id'])] =item['snippet']['title']
In [110]:
cat_dict
Out[110]:
{1: 'Film & Animation',
2: 'Autos & Vehicles',
10: 'Music',
15: 'Pets & Animals',
17: 'Sports',
18: 'Short Movies',
19: 'Travel & Events',
20: 'Gaming',
21: 'Videoblogging',
22: 'People & Blogs',
23: 'Comedy',
24: 'Entertainment',
25: 'News & Politics',
26: 'Howto & Style',
27: 'Education',
28: 'Science & Technology',
29: 'Nonprofits & Activism',
30: 'Movies',
31: 'Anime/Animation',
32: 'Action/Adventure',
33: 'Classics',
34: 'Comedy',
35: 'Documentary',
36: 'Drama',
37: 'Family',
38: 'Foreign',
39: 'Horror',
40: 'Sci-Fi/Fantasy',
41: 'Thriller',
42: 'Shorts',
43: 'Shows',
44: 'Trailers'}
In [111]:
full_df['category_name'] = full_df['category_id'].map(cat_dict)
In [112]:
full_df.head()
Out[112]:
| video_id | trending_date | title | channel_title | category_id | publish_time | tags | views | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | video_error_or_removed | description | category_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | n1WpP7iowLc | 17.14.11 | Eminem - Walk On Water (Audio) ft. Beyoncé | EminemVEVO | 10 | 2017-11-10T17:00:03.000Z | Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... | 17158579 | 787425 | 43420 | 125882 | https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg | False | False | False | Eminem's new track Walk on Water ft. Beyoncé ... | Music |
| 1 | 0dBIkQ4Mz1M | 17.14.11 | PLUSH - Bad Unboxing Fan Mail | iDubbbzTV | 23 | 2017-11-13T17:00:00.000Z | plush|"bad unboxing"|"unboxing"|"fan mail"|"id... | 1014651 | 127794 | 1688 | 13030 | https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg | False | False | False | STill got a lot of packages. Probably will las... | Comedy |
| 2 | 5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Le... | Rudy Mancuso | 23 | 2017-11-12T19:05:24.000Z | racist superman|"rudy"|"mancuso"|"king"|"bach"... | 3191434 | 146035 | 5339 | 8181 | https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | False | False | False | WATCH MY PREVIOUS VIDEO ⶠ\n\nSUBSCRIBE ⺠... | Comedy |
| 3 | d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | 24 | 2017-11-12T18:01:41.000Z | ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... | 2095828 | 132239 | 1989 | 17518 | https://i.ytimg.com/vi/d380meD0W0M/default.jpg | False | False | False | I know it's been a while since we did this sho... | Entertainment |
| 4 | 2Vv-BfVoq4g | 17.14.11 | Ed Sheeran - Perfect (Official Music Video) | Ed Sheeran | 10 | 2017-11-09T11:04:14.000Z | edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... | 33523622 | 1634130 | 21082 | 85067 | https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg | False | False | False | ð§: https://ad.gt/yt-perfect\nð°: https://... | Music |
In [119]:
plt.figure(figsize=(12,8))
sns.boxplot(x='category_name' , y = 'likes', data = full_df)
plt.xticks(rotation='vertical')
Out[119]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [Text(0, 0, 'Music'), Text(1, 0, 'Comedy'), Text(2, 0, 'Entertainment'), Text(3, 0, 'News & Politics'), Text(4, 0, 'People & Blogs'), Text(5, 0, 'Howto & Style'), Text(6, 0, 'Film & Animation'), Text(7, 0, 'Science & Technology'), Text(8, 0, 'Gaming'), Text(9, 0, 'Sports'), Text(10, 0, 'Nonprofits & Activism'), Text(11, 0, 'Pets & Animals'), Text(12, 0, 'Travel & Events'), Text(13, 0, 'Autos & Vehicles'), Text(14, 0, 'Education'), Text(15, 0, 'Shows'), Text(16, 0, 'Movies'), Text(17, 0, 'Trailers')])
Analyse whether the audience is engaged or not!
In [121]:
full_df['like_rate']=(full_df['likes']/full_df['views'])*100
full_df['dislike_rate']=(full_df['dislikes']/full_df['views'])*100
full_df['comment_count_rate']=(full_df['comment_count']/full_df['views'])*100
In [122]:
full_df['like_rate']
Out[122]:
0 4.589104
1 12.594873
2 4.575843
3 6.309630
4 4.874563
...
375936 7.820293
375938 5.635623
375939 4.507286
375940 3.408645
375941 3.464728
Name: like_rate, Length: 339525, dtype: float64
In [123]:
full_df['dislike_rate']
Out[123]:
0 0.253051
1 0.166363
2 0.167292
3 0.094903
4 0.062887
...
375936 0.049061
375938 0.035875
375939 0.096770
375940 0.050275
375941 2.066500
Name: dislike_rate, Length: 339525, dtype: float64
In [124]:
full_df['comment_count_rate']
Out[124]:
0 0.733639
1 1.284185
2 0.256342
3 0.835851
4 0.253752
...
375936 0.758070
375938 0.369648
375939 0.374326
375940 0.231204
375941 1.404942
Name: comment_count_rate, Length: 339525, dtype: float64
In [125]:
full_df.columns
Out[125]:
Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
'thumbnail_link', 'comments_disabled', 'ratings_disabled',
'video_error_or_removed', 'description', 'category_name', 'like_rate',
'dislike_rate', 'comment_count_rate'],
dtype='object')
In [128]:
plt.figure(figsize=(8,6))
sns.boxplot(x='category_name' , y = 'like_rate', data = full_df)
plt.xticks(rotation='vertical')
plt.show()
In [129]:
sns.regplot(x='views', y ='likes', data = full_df)
Out[129]:
<Axes: xlabel='views', ylabel='likes'>
In [131]:
full_df[['views', 'likes', 'dislikes']].corr()
Out[131]:
| views | likes | dislikes | |
|---|---|---|---|
| views | 1.000000 | 0.779531 | 0.405428 |
| likes | 0.779531 | 1.000000 | 0.451809 |
| dislikes | 0.405428 | 0.451809 | 1.000000 |
In [132]:
sns.heatmap(full_df[['views', 'likes', 'dislikes']].corr(), annot = True)
Out[132]:
<Axes: >
Trending Videos on YouTube
In [133]:
full_df.head()
Out[133]:
| video_id | trending_date | title | channel_title | category_id | publish_time | tags | views | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | video_error_or_removed | description | category_name | like_rate | dislike_rate | comment_count_rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | n1WpP7iowLc | 17.14.11 | Eminem - Walk On Water (Audio) ft. Beyoncé | EminemVEVO | 10 | 2017-11-10T17:00:03.000Z | Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... | 17158579 | 787425 | 43420 | 125882 | https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg | False | False | False | Eminem's new track Walk on Water ft. Beyoncé ... | Music | 4.589104 | 0.253051 | 0.733639 |
| 1 | 0dBIkQ4Mz1M | 17.14.11 | PLUSH - Bad Unboxing Fan Mail | iDubbbzTV | 23 | 2017-11-13T17:00:00.000Z | plush|"bad unboxing"|"unboxing"|"fan mail"|"id... | 1014651 | 127794 | 1688 | 13030 | https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg | False | False | False | STill got a lot of packages. Probably will las... | Comedy | 12.594873 | 0.166363 | 1.284185 |
| 2 | 5qpjK5DgCt4 | 17.14.11 | Racist Superman | Rudy Mancuso, King Bach & Le... | Rudy Mancuso | 23 | 2017-11-12T19:05:24.000Z | racist superman|"rudy"|"mancuso"|"king"|"bach"... | 3191434 | 146035 | 5339 | 8181 | https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | False | False | False | WATCH MY PREVIOUS VIDEO ⶠ\n\nSUBSCRIBE ⺠... | Comedy | 4.575843 | 0.167292 | 0.256342 |
| 3 | d380meD0W0M | 17.14.11 | I Dare You: GOING BALD!? | nigahiga | 24 | 2017-11-12T18:01:41.000Z | ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... | 2095828 | 132239 | 1989 | 17518 | https://i.ytimg.com/vi/d380meD0W0M/default.jpg | False | False | False | I know it's been a while since we did this sho... | Entertainment | 6.309630 | 0.094903 | 0.835851 |
| 4 | 2Vv-BfVoq4g | 17.14.11 | Ed Sheeran - Perfect (Official Music Video) | Ed Sheeran | 10 | 2017-11-09T11:04:14.000Z | edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... | 33523622 | 1634130 | 21082 | 85067 | https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg | False | False | False | ð§: https://ad.gt/yt-perfect\nð°: https://... | Music | 4.874563 | 0.062887 | 0.253752 |
In [135]:
full_df['channel_title'].value_counts()
Out[135]:
channel_title
The Late Show with Stephen Colbert 710
WWE 643
Late Night with Seth Meyers 592
TheEllenShow 555
Jimmy Kimmel Live 528
...
Daas 1
YT Industries 1
BTLV Le média complémentaire 1
Quem Sabia ? 1
Jessi Osorno 1
Name: count, Length: 37824, dtype: int64
In [139]:
cdf = full_df.groupby(['channel_title']).size().sort_values(ascending=False).reset_index()
In [140]:
cdf = cdf.rename(columns = {0:'total_videos'})
In [141]:
cdf
Out[141]:
| channel_title | total_videos | |
|---|---|---|
| 0 | The Late Show with Stephen Colbert | 710 |
| 1 | WWE | 643 |
| 2 | Late Night with Seth Meyers | 592 |
| 3 | TheEllenShow | 555 |
| 4 | Jimmy Kimmel Live | 528 |
| ... | ... | ... |
| 37819 | Kd Malts | 1 |
| 37820 | Zedan TV | 1 |
| 37821 | Kc Kelly - Rocketprenuer | 1 |
| 37822 | Kbaby | 1 |
| 37823 | Pavel Sidorik TV | 1 |
37824 rows × 2 columns
In [142]:
import plotly.express as px
In [146]:
px.bar(data_frame = cdf[0:20], x = 'channel_title', y='total_videos')
Does punctuations have impact on views , likes, dislikes?
In [147]:
full_df['title'][0]
Out[147]:
'Eminem - Walk On Water (Audio) ft. Beyoncé'
In [148]:
import string
In [150]:
string.punctuation
Out[150]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [152]:
len([char for char in full_df['title'][0] if char in string.punctuation])
Out[152]:
4
In [163]:
def punc_count(text):
return len([char for char in text if char in string.punctuation])
In [168]:
sample = full_df[0:10000]
In [169]:
sample['count_punc'] = sample['title'].apply(punc_count)
In [170]:
sample['count_punc']
Out[170]:
0 4
1 1
2 3
3 3
4 3
..
9995 6
9996 0
9997 1
9998 0
9999 6
Name: count_punc, Length: 10000, dtype: int64
In [171]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' , y = 'likes', data = sample)
plt.show()
In [172]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' , y = 'views', data = sample)
plt.show()
In [174]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' , y = 'dislikes', data = sample)
plt.show()
In [ ]: